In [ ]:
import pandas as pd
# Test
df = pd.DataFrame({"Yes": [30, 21], "No": [12, 20]}, index=["Poll 1", "Poll 2"])
print(df)
Yes No Poll 1 30 12 Poll 2 21 20
In [ ]:
# Reading in salary data from kaggle on Software Engineer positions
salaries = pd.read_csv("Software Engineer Salaries.csv")
salaries.shape
salaries.Company
# is the same as salaries["Company"]
# Drop all rows were an entry is NaN
salaries.dropna().shape
salaries = salaries.dropna()
In [ ]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"
print(salaries.head())
# How many different companies are contained in the dataset?
print(salaries.nunique(axis=0)["Company"])
# How many different job titles are contained in the dataset?
print(salaries.nunique(axis=0)["Job Title"])
# What is the distribution of scores? Add the mean as a vertical line.
fig = px.histogram(salaries, x="Company Score")
fig.add_vline(x=salaries["Company Score"].mean())
Company Company Score \
0 ViewSoft 4.8
1 Workiva 4.3
2 Garmin International, Inc. 3.9
3 Snapchat 3.5
4 Vitesco Technologies Group AG 3.1
Job Title Location Date \
0 Software Engineer Manassas, VA 8d
1 Software Support Engineer Remote 2d
2 C# Software Engineer Cary, NC 2d
3 Software Engineer, Fullstack, 1+ Years of Expe... Los Angeles, CA 2d
4 Software Engineer Seguin, TX 2d
Salary
0 $68K - $94K (Glassdoor est.)
1 $61K - $104K (Employer est.)
2 $95K - $118K (Glassdoor est.)
3 $97K - $145K (Employer est.)
4 $85K - $108K (Glassdoor est.)
578
468
In [ ]:
import numpy as np
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"
# What is the score of each company? And sort from best to worst by resetting index 0,1,2,3,... and drop index column - save under new variable scores
# What is the minimum and maximum salary per positon? Data cleaning requiered since salary column contains string + needs to differentiate for case where only one value is given
split_salary = salaries.Salary.apply(lambda x: pd.Series(x.split("-")))
split_salary = split_salary.apply(lambda x: x.str.extract(r"(\d+)", expand=False))
split_salary = split_salary.apply(pd.to_numeric)
split_salary = split_salary.rename(columns={0: "Minimum", 1: "Maximum"})
split_salary["Maximum"] = split_salary["Maximum"].astype(float)
split_salary["Minimum"] = split_salary["Minimum"].astype(float)
split_salary["Maximum"] = np.where(pd.isnull(split_salary["Maximum"]), split_salary["Minimum"], split_salary["Maximum"])
split_salary = pd.concat([salaries["Company"], salaries["Company Score"], salaries["Job Title"], split_salary], axis=1)
px.scatter(x=split_salary["Company Score"], y=split_salary["Minimum"], trendline="ols")
px.scatter(x=split_salary["Company Score"], y=split_salary["Maximum"], trendline="lowess")
# There seems to be no correlation between how well companies are scored and how high their salaries are.
In [ ]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"
# We could ask ourselves whether salaries differ substantially between different states.
split_location = salaries.Location.apply(lambda x: pd.Series(x.split(",")))
split_location = split_location.rename(columns={0: "City", 1: "State"})
split_location = pd.concat([split_salary, split_location], axis=1)
split_location = split_location.dropna()
px.box(x=split_location["State"], y=split_location["Maximum"])
px.box(x=split_location["State"], y=split_location["Minimum"])
average_salaries_per_state = pd.DataFrame(split_location.groupby("State")["Minimum"].mean()).reset_index()
px.choropleth(locations=average_salaries_per_state["State"].str.strip(), locationmode="USA-states", color=average_salaries_per_state["Minimum"], scope="usa")
In [ ]:
import statsmodels.api as sm
import pandas as pd
# Can we model minimum salary based on US state?
split_location_encoded = pd.get_dummies(split_location, columns=["State"])
Y = split_location_encoded["Minimum"]
X = split_location_encoded.drop(["Minimum", "Maximum", "Company", "Company Score", "Job Title", "City"], axis=1)
X = X.astype(int)
X = sm.add_constant(X)
# Conduct a multiple linear regression analysis
model = sm.OLS(Y, X).fit()
print(model.summary())
# > Looks like there is no state that significantly predicts salary coefficient added to the model intercept
OLS Regression Results
==============================================================================
Dep. Variable: Minimum R-squared: 0.055
Model: OLS Adj. R-squared: -0.013
Method: Least Squares F-statistic: 0.8039
Date: Fri, 16 Aug 2024 Prob (F-statistic): 0.811
Time: 13:52:55 Log-Likelihood: -3216.5
No. Observations: 637 AIC: 6521.
Df Residuals: 593 BIC: 6717.
Df Model: 43
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 101.0912 3.021 33.458 0.000 95.157 107.025
State_ AL 12.4088 19.352 0.641 0.522 -25.598 50.416
State_ AR -8.5912 27.201 -0.316 0.752 -62.013 44.830
State_ AZ -8.0912 11.916 -0.679 0.497 -31.494 15.312
State_ CA 1.8117 4.829 0.375 0.708 -7.672 11.296
State_ CO 0.2588 9.067 0.029 0.977 -17.548 18.065
State_ CT 5.3373 14.762 0.362 0.718 -23.655 34.329
State_ DC 16.0088 12.461 1.285 0.199 -8.464 40.482
State_ DE -11.7579 22.278 -0.528 0.598 -55.511 31.995
State_ FL -1.7579 7.954 -0.221 0.825 -17.378 13.863
State_ GA -1.4391 8.525 -0.169 0.866 -18.182 15.303
State_ HI 33.9088 38.349 0.884 0.377 -41.407 109.225
State_ IA -7.8412 19.352 -0.405 0.685 -45.848 30.166
State_ ID -7.5912 19.352 -0.392 0.695 -45.598 30.416
State_ IL -6.3956 8.525 -0.750 0.453 -23.138 10.347
State_ IN 11.2421 22.278 0.505 0.614 -32.511 54.995
State_ KS 36.9088 19.352 1.907 0.057 -1.098 74.916
State_ KY 53.4088 27.201 1.964 0.050 -0.013 106.830
State_ MA 7.6655 6.973 1.099 0.272 -6.030 21.361
State_ MD 6.0827 5.505 1.105 0.270 -4.730 16.895
State_ MI -1.0386 9.276 -0.112 0.911 -19.257 17.180
State_ MN 23.3532 13.096 1.783 0.075 -2.368 49.074
State_ MO -12.0912 38.349 -0.315 0.753 -87.407 63.225
State_ MS -34.4246 22.278 -1.545 0.123 -78.177 9.328
State_ NC 3.8179 11.916 0.320 0.749 -19.585 27.221
State_ NE 5.9088 27.201 0.217 0.828 -47.513 59.330
State_ NH 13.9088 27.201 0.511 0.609 -39.513 67.330
State_ NJ 8.3298 9.276 0.898 0.370 -9.889 26.548
State_ NM -31.5912 27.201 -1.161 0.246 -85.013 21.830
State_ NY -7.5528 8.083 -0.934 0.350 -23.428 8.323
State_ OH 10.5451 11.916 0.885 0.377 -12.858 33.948
State_ OK 11.2421 22.278 0.505 0.614 -32.511 54.995
State_ OR 1.1088 17.362 0.064 0.949 -32.989 35.207
State_ PA 17.7323 9.752 1.818 0.070 -1.420 36.885
State_ PR 5.9088 27.201 0.217 0.828 -47.513 59.330
State_ RI -2.5912 27.201 -0.095 0.924 -56.013 50.830
State_ SC -23.4246 22.278 -1.051 0.293 -67.177 20.328
State_ TN -29.0912 38.349 -0.759 0.448 -104.407 46.225
State_ TX 3.9564 6.628 0.597 0.551 -9.060 16.973
State_ UT 10.9088 15.897 0.686 0.493 -20.312 42.130
State_ VA -10.6047 6.973 -1.521 0.129 -24.300 3.091
State_ VT -17.0912 22.278 -0.767 0.443 -60.844 26.661
State_ WA 6.9326 6.628 1.046 0.296 -6.084 19.949
State_ WI -3.5458 11.916 -0.298 0.766 -26.949 19.857
State_ WV 18.9088 38.349 0.493 0.622 -56.407 94.225
==============================================================================
Omnibus: 220.217 Durbin-Watson: 1.924
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1002.020
Skew: 1.509 Prob(JB): 2.59e-218
Kurtosis: 8.352 Cond. No. 1.16e+15
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 5.07e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [ ]:
salaries.loc[salaries["Job Title"] == "Software Engineer",]
split_salary.loc[(split_salary["Job Title"] == "Software Engineer") & (split_salary["Minimum"] > 40),]
split_location.loc[split_location.State.isin([" AR"]),]
# Whats the position with the highest minimum salary per company?
split_salary.groupby(["Company"]).apply(lambda split_salary: split_salary.loc[split_salary.Minimum.idxmax()])
# > Here, per group the index (row number) of the highest minimum salary position is used to return the row of that group , then all rows for each group are returend in one dataframe containing company names only once
# How many different listings posted on different dates for each Job title were posted per company?
num_listings_per_position = salaries.groupby(["Company", "Job Title"]).Date.agg([len])
#print(num_listings_per_position)
# This pd DataFrame uses multi index:
mi = num_listings_per_position.index
type(mi)
num_listings_per_position.sort_values(by="len", ascending=False)
# > Microsoft posted 3 different Software Engineer positions in the past few days, Boeing has posted 2... etc...
# We can also sort by the index (Company and Job Title first letter)
num_listings_per_position.sort_index()
C:\Users\klosed\AppData\Local\Temp\ipykernel_20556\2452321493.py:6: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
Out[ ]:
| len | ||
|---|---|---|
| Company | Job Title | |
| 3 Reasons Consulting | Full Stack Software Engineer | 1 |
| 360care | Sr Associate Software Engineer, Database as a Service | 1 |
| 3R Info | Software Engineer | 2 |
| 7-Eleven, Inc. | Software Engineer | 1 |
| 9th networks Inc | Software Engineer IV | 1 |
| ... | ... | ... |
| f5 | Senior Software Engineer | 1 |
| Software Engineer III | 1 | |
| iconectiv, LLC. | Full Stack Software Engineer | 1 |
| kasmo cloud solutions | Software Engineers | 1 |
| thatgamecompany | Senior Software Engineer | 1 |
738 rows × 1 columns
In [ ]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"
titles = pd.DataFrame(salaries["Job Title"].value_counts()).reset_index()
titles_dict = titles.set_index("Job Title")["count"].to_dict()
fig = px.bar(titles["count"], hover_name=titles["Job Title"])
fig.show()
# Suppose we would like to add the count of each job title to the original data frame
salaries["Count"] = salaries["Job Title"].map(titles_dict)
print(salaries)
Company Company Score \
0 ViewSoft 4.8
1 Workiva 4.3
2 Garmin International, Inc. 3.9
3 Snapchat 3.5
4 Vitesco Technologies Group AG 3.1
.. ... ...
759 OpenAI 3.1
760 NCR Atleos 3.9
761 Electronic Warfare Associates 3.8
762 BillGO, Inc. 3.3
763 Genesys 3.6
Job Title Location \
0 Software Engineer Manassas, VA
1 Software Support Engineer Remote
2 C# Software Engineer Cary, NC
3 Software Engineer, Fullstack, 1+ Years of Expe... Los Angeles, CA
4 Software Engineer Seguin, TX
.. ... ...
759 Apprentice ETL Software Engineer Elkridge, MD
760 Software Engineer, Tool & Infrastructure, Reli... Palo Alto, CA
761 Software Engineer, Engineering Acceleration San Francisco, CA
762 ATM Software Engineer (Level III) Frisco, TX
763 Software Engineer Mount Laurel, NJ
Date Salary Count
0 8d $68K - $94K (Glassdoor est.) 139
1 2d $61K - $104K (Employer est.) 2
2 2d $95K - $118K (Glassdoor est.) 2
3 2d $97K - $145K (Employer est.) 1
4 2d $85K - $108K (Glassdoor est.) 139
.. ... ... ...
759 30d+ $160K - $385K (Employer est.) 1
760 1d $222K (Employer est.) 1
761 5d $82K - $118K (Glassdoor est.) 1
762 30d+ $69K - $107K (Glassdoor est.) 1
763 30d+ $95K - $143K (Employer est.) 139
[753 rows x 7 columns]